Abstract
Customer churn has been an important topic in the telecom industry because it directly affects a company’s profitability. Churn refers to customers stopping the use of a company’s product or service.
Data
Downloaded from Kaggle, the telecom customer churn dataset contains 7043 rows (customers) and 21 columns (features). Each row represents a customer and includes information about that customer's demographics, subscribed services, account, and churn status.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from sklearn import preprocessing
import seaborn as sns
from scipy.spatial import distance_matrix
from sklearn.cluster import KMeans
import missingno as msno
from scipy.spatial.distance import cdist
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")
# Load the Kaggle telco churn CSV into a DataFrame and inspect it.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()
df.info()
# Count distinct customer IDs; it matches the row count, so no duplicates.
len(df.customerID.unique())
# since all the record ids are unique, there are no duplicate rows.
# TotalCharges is read as object/text; coerce it to numeric
# (unparseable entries become NaN instead of raising).
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')
# Count missing values per column after the coercion.
df.isnull().sum()
# The TotalCharges coercion produced a handful of NaN rows (11); discard them.
df = df.dropna()
# customerID is an arbitrary unique key with no predictive value, so drop it.
df1 = df.drop(columns='customerID')
# Encode the target as a binary numeric column: 'No' -> 0, 'Yes' -> 1.
df1['Churn'] = df1['Churn'].map({'No': 0, 'Yes': 1})
df1.head()
# One-hot encode every categorical column so numeric correlations can be computed.
df_dummies = pd.get_dummies(df1)
df_dummies.head()
# Bar chart of each encoded feature's Pearson correlation with the Churn target.
plt.style.use("ggplot")
sns.set_style('whitegrid')
plt.figure(figsize=(15,8))
plt.title('The correlation between target and features')
df_dummies.corr()['Churn'].sort_values(ascending = False).plot(kind='bar')
gender, SeniorCitizen, Partner, Dependents, PhoneService, PaperlessBilling
# Count plots of six categorical features, split by churn status.
# FIX: pass the column via the x= keyword -- positional column names to
# seaborn's countplot were deprecated in seaborn 0.12 and later removed.
plt.figure(figsize=(15, 15))
plt.subplot(3, 2, 1)
sns.countplot(x='gender', data=df1, hue='Churn')
plt.title('The distribution of gender', fontsize=13)
plt.tight_layout()
plt.subplot(3, 2, 2)
sns.countplot(x='SeniorCitizen', data=df1, hue='Churn')
plt.title('The distribution of SeniorCitizen', fontsize=13)
plt.tight_layout()
plt.subplot(3, 2, 3)
sns.countplot(x='Partner', data=df1, hue='Churn')
plt.title('The distribution of Partner', fontsize=13)
plt.tight_layout()
plt.subplot(3, 2, 4)
sns.countplot(x='Dependents', data=df1, hue='Churn')
plt.title('The distribution of Dependents', fontsize=13)
plt.tight_layout()
plt.subplot(3, 2, 5)
sns.countplot(x='PhoneService', data=df1, hue='Churn')
plt.title('The distribution of PhoneService', fontsize=13)
plt.tight_layout()
plt.subplot(3, 2, 6)
sns.countplot(x='PaperlessBilling', data=df1, hue='Churn')
plt.title('The distribution of PaperlessBilling', fontsize=13)
plt.tight_layout()
Interpretations:
# Small summary tables counting customers by Partner and by Dependents status.
partner = pd.DataFrame({'Partner':['Yes','No'],
'Count':[len(df[df['Partner']=='Yes']),len(df[df['Partner']=='No'])]})
dependent = pd.DataFrame({'Dependents':['Yes','No'],
'Count':[len(df[df['Dependents']=='Yes']),len(df[df['Dependents']=='No'])]})
# Shared two-colour palette, reused by later charts.
colors = ['#FF9D65','#FFDC66']
# Side-by-side pie charts of the two distributions.
plt.figure(figsize=(10, 8))
plt.subplot(1, 2, 1)
plt.title("Pie chart of Customer's Partner Status", fontsize=15)
plt.pie(data=partner,x='Count',labels='Partner',startangle=90, textprops={'fontsize': 14},
colors=colors, wedgeprops={'edgecolor': 'black'}, autopct='%1.f%%', shadow=True)
plt.legend(title='Partners Status',loc="best")
plt.tight_layout()
plt.subplot(1, 2, 2)
plt.title("Pie chart of Customer's Dependents Status", fontsize=15)
plt.pie(data=dependent,x='Count',labels='Dependents',startangle=90, textprops={'fontsize': 14},
colors=colors, wedgeprops={'edgecolor': 'black'}, autopct='%1.f%%', shadow=True)
plt.legend(title='Dependents Status',loc="best")
plt.tight_layout()
# Compare the Dependents mix of customers with and without a partner.
# FIX: the original reassigned df1/df2 here, clobbering the churn-encoded
# frame df1 that later cells (charge/tenure histograms, churn-rate chart)
# still rely on; dedicated names are used for the two subsets instead.
no_partner = df[df['Partner']=='No']
with_partner = df[df['Partner']=='Yes']
partner_yes = pd.DataFrame({'Dependents':['Yes','No'],
'Count':[len(with_partner[with_partner['Dependents']=='Yes']),len(with_partner[with_partner['Dependents']=='No'])]})
partner_no = pd.DataFrame({'Dependents':['Yes','No'],
'Count':[len(no_partner[no_partner['Dependents']=='Yes']),len(no_partner[no_partner['Dependents']=='No'])]})
plt.figure(figsize=(10, 8))
plt.subplot(1, 2, 1)
plt.pie(data=partner_yes,x='Count',labels='Dependents',startangle=90, textprops={'fontsize': 14},
colors=colors, wedgeprops={'edgecolor': 'black'}, autopct='%1.f%%', shadow=True)
plt.title('The Dependents Distribution of Customer with Partner', fontsize=13)
plt.legend(title='Dependents Status',loc="best")
plt.tight_layout()
plt.subplot(1, 2, 2)
plt.pie(data=partner_no,x='Count',labels='Dependents',startangle=90, textprops={'fontsize': 14},
colors=colors, wedgeprops={'edgecolor': 'black'}, autopct='%1.f%%', shadow=True)
plt.title('The Dependents Distribution of Customer with no Partner', fontsize=13)
plt.legend(title='Dependents Status',loc="best")
plt.tight_layout()
We can see from the charts above that, among customers who have a partner, only about half have dependents, while the other half do not have any dependents. Additionally, as expected, among the customers who do not have a partner, a large majority (90%) do not have any dependents.
# Histograms of monthly and total charges, split by churn outcome.
# NOTE(review): these filters assume df1 is still the churn-encoded frame
# (Churn in {0, 1}); an earlier cell reassigned df1 to a Partner subset --
# verify which frame is in scope when running top to bottom.
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
df1[df1.Churn == 0].MonthlyCharges.hist(bins=35, alpha=0.5, label='Churn=No')
df1[df1.Churn == 1].MonthlyCharges.hist(bins=35, alpha=0.5, label='Churn=Yes')
plt.xlabel('Monthly Payment')
plt.ylabel('count')
plt.legend()
plt.title('Histogram of Monthly Charges', fontsize=13)
plt.tight_layout()
plt.subplot(1, 2, 2)
df1[df1.Churn == 0].TotalCharges.hist(bins=35, alpha=0.6, label='Churn=No')
df1[df1.Churn == 1].TotalCharges.hist(bins=35, alpha=0.6, label='Churn=Yes')
plt.xlabel('Total Payment')
plt.ylabel('count')
plt.legend()
plt.title('Histogram of Total Charges', fontsize=13)
plt.tight_layout()
# Scatter of the two charge columns; they appear strongly related.
plt.figure(figsize=(7,5))
sns.scatterplot(data=df1, x='MonthlyCharges', y='TotalCharges')
plt.title('The relationship between MonthlyCharges and TotalCharges')
plt.tight_layout()
Total charges and monthly charges are closely related features (total charges grow roughly in proportion to monthly charges over a customer's tenure), so if one is predictive of the target, the other is likely to be as well.
# Histogram of customer tenure (months), split by churn outcome.
# NOTE(review): assumes df1 is the churn-encoded frame (Churn in {0, 1});
# an earlier cell reassigned df1 to a Partner subset -- verify.
df1[df1.Churn == 0].tenure.hist(bins=35, alpha=0.5, label='Churn=No')
df1[df1.Churn == 1].tenure.hist(bins=35, alpha=0.5, label='Churn=Yes')
plt.xlabel('Tenure (month)')
plt.ylabel('count')
plt.legend()
plt.title('Histogram of Tenure', fontsize=13)
plt.tight_layout()
From the histogram above we can see that many customers have been with the telecom company for just a month, while quite a few have stayed for about 72 months. This could potentially be because different customers have different contracts: depending on the contract they are on, it may be easier or harder for them to stay with or leave the company. This feature therefore appears to have a strong influence on our target.
# Interactive bar chart of contract-type counts (plotly express).
import plotly.express as px
fig = px.histogram(df, x='Contract', color='Contract', title='The distribution of Contract', template='none')
fig.show()
Most of the customers are on a month-to-month contract, which is consistent with the tenure distribution above. Roughly equal numbers of customers are on one-year and two-year contracts.
# Tenure histograms broken out by contract type, on a shared y-axis.
# NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
# sns.histplot is the modern equivalent -- confirm the installed version.
fig, (ax1,ax2,ax3) = plt.subplots(nrows=1, ncols=3, sharey = True, figsize = (20,6))
ax = sns.distplot(df[df['Contract']=='Month-to-month']['tenure'],
hist=True, kde=False,
bins=int(180/5), color = '#FF9935',
hist_kws={'edgecolor':'black'},
ax=ax1)
ax.set_ylabel('# of Customers')
ax.set_xlabel('Tenure (months)')
ax.set_title('Month to Month Contract')
ax = sns.distplot(df[df['Contract']=='One year']['tenure'],
hist=True, kde=False,
bins=int(180/5), color = 'steelblue',
hist_kws={'edgecolor':'black'},
ax=ax2)
ax.set_xlabel('Tenure (months)',size = 14)
ax.set_title('One Year Contract',size = 14)
ax = sns.distplot(df[df['Contract']=='Two year']['tenure'],
hist=True, kde=False,
bins=int(180/5), color = 'darkblue',
hist_kws={'edgecolor':'black'},
ax=ax3)
ax.set_xlabel('Tenure (months)')
ax.set_title('Two Year Contract')
From here it's easy to tell most of the monthly contracts last for 1-2 months, while the 2 year contracts tend to last for about 70 months. This shows that the customers taking a longer contract are more loyal to the company and tend to stay with it for a longer period of time.
This is also what we saw in the earlier chart on correlation with the churn rate.
Churn: whether a customer leaves the company
# Bar chart of the churn rate in percent.  The plotted series values are
# already percentages (value_counts * 100 / n), so each bar is labelled with
# its own height.  FIX: the original divided the height by the sum of patch
# *widths* (a geometry quantity that only coincidentally equalled 1.0 for two
# default-width bars) -- labelling with the height directly is the correct,
# robust computation.
ax = (df1['Churn'].value_counts()*100.0 /len(df1)).plot(kind='bar',stacked = True, rot = 0, color = colors,figsize = (8,6))
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_ylabel('% Customers',size = 14)
ax.set_xlabel('Churn',size = 14)
ax.set_title('Churn Rate', size = 14)
# Annotate each bar with its percentage value (the bar height), placed just
# inside the top of the bar.
for bar in ax.patches:
    ax.text(bar.get_x() + .15, bar.get_height() - 4.0,
            str(round(bar.get_height(), 1)) + '%',
            fontsize=12,
            color='white',
            weight = 'bold',
            size = 14)
In our data, 74% of the customers do not churn. Clearly the data is imbalanced, as we would expect a large majority of customers not to churn. This is important to keep in mind for our modelling, as skewness could lead to a lot of false negatives. We will see in the modelling section how to account for skewness in the data.
# Separate the target vector from the feature matrix.
y = df_dummies['Churn'].values
X = df_dummies.drop(columns = ['Churn'])
# Rescale every feature to [0, 1] so distance-based models (e.g. KNN, SVM)
# are not dominated by large-magnitude columns such as TotalCharges.
from sklearn.preprocessing import MinMaxScaler
feature_names = X.columns.values
scaler = MinMaxScaler(feature_range=(0, 1))
X = pd.DataFrame(scaler.fit_transform(X), columns=feature_names)
# Hold out 20% of the rows for final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Baseline: 3-fold cross-validated accuracy of an untuned random forest.
model = RandomForestClassifier()
results = cross_val_score(model, X_train, y_train, cv=3)
mean_pct = round(results.mean() * 100, 2)
print(f"Accuracy: {mean_pct}%")
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest.
# BUG FIX: the original dict listed 'max_features' twice, so the first entry
# (range(2, 12)) was silently discarded by Python's dict literal semantics.
# The two candidate sets are merged here; 'auto' is dropped because it was
# just an alias of 'sqrt' for classifiers and was removed in scikit-learn 1.3.
param_grid = {'n_estimators': [int(x) for x in np.linspace(start=10, stop=80, num=10)],
              'max_features': ['sqrt', *range(2, 12)],
              'min_samples_split': [2,5],
              'min_samples_leaf': [1,2],
              'bootstrap': [True, False]}
# Exhaustive search scored on accuracy (GridSearchCV's default CV folds).
rdf = GridSearchCV(RandomForestClassifier(), param_grid, scoring='accuracy')
rdf.fit(X_train, y_train)
print('The score of the grid is:',rdf.best_score_)
# extract the best parameters
for key, val in rdf.best_params_.items():
    print(f"Best hyperparameter is {key}: {val}")
best_RF_model = rdf.best_estimator_
from sklearn.pipeline import make_pipeline
# Wrap the tuned forest in a (single-step) pipeline and fit on the train split.
rf_pipe = make_pipeline(best_RF_model)
rf_pipe.fit(X_train, y_train)
pipe_pred_rf = rf_pipe.predict(X_test)
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=True, random_state=16)
# FIX: run the 10-fold CV once and report mean/std of the SAME score array.
# The original called cross_val_score twice, doubling the cost and -- since a
# random forest refit is not deterministic here -- letting the printed mean
# and std come from two different CV runs.
cv_scores = cross_val_score(rf_pipe, X_train, y_train, cv=cv)
print('cv_acc_mean',round(cv_scores.mean(),4))
print('cv_acc_std',round(cv_scores.std(),4))
# Confusion matrix of the tuned random forest on the held-out test set.
y_pred_rf = best_RF_model.predict(X_test)
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_test, y_pred_rf)
ax = plt.subplot()
# mat is transposed, so heatmap columns = true labels, rows = predicted labels.
sns.heatmap(mat.T, square=True, annot=True, annot_kws={"size": 13},cbar=False,fmt="d",
cmap= "Blues", linewidths=.2)
# Widen the y-limits so edge rows are not clipped -- presumably a workaround
# for the matplotlib 3.1 heatmap clipping issue; TODO confirm it is still needed.
ax.set_ylim([0,2])
plt.title('RF Confusion Matrix',y=-0.1, fontsize=13)
plt.xlabel('True label',y=-0.1, fontsize=13)
plt.ylabel('Predicted label', fontsize=13)
fig = plt.gcf()
fig.set_size_inches(8, 9, forward=True)
from sklearn.metrics import classification_report
# Full per-class report for the tuned random forest on the test split.
print(classification_report(y_test,y_pred_rf))
# Headline metrics, each shown as a percentage rounded to two decimals.
for label, scorer in (("Precision", metrics.precision_score),
                      ("Recall", metrics.recall_score),
                      ("Accuracy", metrics.accuracy_score),
                      ("F1-score", metrics.f1_score)):
    print(f"{label}: {round(scorer(y_test,y_pred_rf)*100,2)}%")
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# Probability of the positive (churn) class for every test row.
probs = rf_pipe.predict_proba(X_test)[:, 1]
# ROC coordinates and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, probs)
print("Area Under Curve (AUC) = ",metrics.auc(fpr,tpr))
# define the roc curve function
def plot_roc_curve(fpr, tpr):
    """Plot a ROC curve, with the AUC in the legend, against the chance diagonal."""
    auc_value = metrics.auc(fpr, tpr)
    plt.figure(figsize=(8, 5))
    plt.plot(fpr, tpr, color='red', label=f'AUC = {auc_value:0.2f}')
    # Dashed diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], color='black', linestyle='--')
    plt.xlabel('False Positive Rate', fontdict={'size': 12})
    plt.ylabel('True Positive Rate', fontdict={'size': 12})
    plt.title('ROC curve - RF Model', fontsize=15)
    plt.legend()
    plt.show()
# Plot ROC Curve using our defined function
plot_roc_curve(fpr, tpr)
# Tidy table of random-forest feature importances, highest first.
feature_imp = (
    pd.DataFrame(best_RF_model.feature_importances_, index=X_train.columns,
                 columns=['importance'])
    .sort_values('importance', ascending=False)
    .reset_index()
    .rename(columns={"index": "Feature"})
)
feature_imp
# Horizontal bar chart of the ten most important features.
importances_rf = best_RF_model.feature_importances_
feat_importances_rf = pd.Series(importances_rf, index=X.columns)
plt.figure(figsize=(8,5))
feat_importances_rf.nlargest(10).plot(kind='barh')
Interpretation: Tenure, total charges and monthly charges are essential predictors. Tenure measures how long a customer has been with the company; intuitively, a customer who has stayed for a long time is more invested in the platform, and the same reasoning applies to the charges. A customer who is willing to pay more for the service is also more likely to stay. Monthly charges and total charges are two related features, so it is not surprising to see them both in the top-predictor list. Among the other top predictors, customers who have no technical support, no online security, and who pay by electronic check are more likely to churn.
# Baseline KNN with default hyperparameters.
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(f"Accuracy of base KNN: {round(metrics.accuracy_score(y_test, y_pred)*100, 2)}%")
# Sweep k = 1..49 and record the test accuracy plus its standard error.
Ks = 50
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
for n in range(1,Ks):
    # Train Model and Predict
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat = neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)
    # Standard error of the per-sample correctness indicator.
    std_acc[n-1] = np.std(yhat==y_test)/np.sqrt(yhat.shape[0])
plt.figure(figsize=(8,5))
# FIXES vs the original: removed the unused (and misspelled) ConfustionMx
# list; dropped the conflicting 'g' format string (the color='blue' keyword
# always won); the legend now says +/- 1xstd, matching the +/- 1 std band
# actually drawn by fill_between.
plt.plot(range(1,Ks),mean_acc,color='blue')
plt.fill_between(range(1,Ks),mean_acc - 1 * std_acc,mean_acc + 1 * std_acc, alpha=0.10)
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.title('The accuracy for different number of neighbors')
plt.tight_layout()
plt.show()
Number of neighbors (K)
Distance functions
Weight Options
CV = 10 & scoring = 'accuracy'
# KNN hyperparameter search space: neighborhood size, distance metric, and
# whether neighbors are weighted by distance.
parameters = {
    'n_neighbors': [4, 5, 6],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
    'weights': ['uniform', 'distance'],
}
# 10-fold, accuracy-scored exhaustive search.
Grid_KNN = GridSearchCV(KNeighborsClassifier(), parameters, cv=10, scoring='accuracy')
Grid_KNN.fit(X_train, y_train)
best_KNN_model = Grid_KNN.best_estimator_
# helper function for printing out grid search results
def print_grid_search_metrics(gs):
    """Print the best CV score of a fitted GridSearchCV and its parameters."""
    print(f"Best score: {gs.best_score_}")
    print("Best parameters set")
    for param_name in sorted(gs.best_params_):
        print(param_name, ':', str(gs.best_params_[param_name]))
# print best parameters
print_grid_search_metrics(Grid_KNN)
from sklearn.pipeline import make_pipeline
# Single-step pipeline around the tuned KNN model.
knn_pipe = make_pipeline(best_KNN_model)
knn_pipe.fit(X_train, y_train)
pipe_pred = knn_pipe.predict(X_test)
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=True, random_state=16)
# FIX: run the 10-fold CV once and take mean/std from the same score array
# (the original called cross_val_score twice, doubling the cost).
cv_scores = cross_val_score(knn_pipe, X_train, y_train, cv=cv)
print('cv_acc_mean',round(cv_scores.mean(),4))
print('cv_acc_std',round(cv_scores.std(),4))
# Confusion matrix and headline metrics for the tuned KNN on the test set.
y_pred_KNN = best_KNN_model.predict(X_test)
mat = confusion_matrix(y_test, y_pred_KNN)
ax = plt.subplot()
# mat is transposed: heatmap columns = true labels, rows = predicted labels.
sns.heatmap(mat.T, square=True, annot=True, annot_kws={"size": 13},cbar=False,fmt="d",
cmap= 'Blues', linewidths=.2)
# Widen y-limits so edge rows are not clipped -- presumably a matplotlib
# heatmap workaround; TODO confirm it is still needed.
ax.set_ylim([0,2])
plt.title('KNN Confusion Matrix',y=-0.1, fontsize=13)
plt.xlabel('True label',y=-0.1, fontsize=13)
plt.ylabel('Predicted label', fontsize=13)
fig = plt.gcf()
fig.set_size_inches(8, 9, forward=True)
print(f"Precision: {round(metrics.precision_score(y_test,y_pred_KNN)*100,2)}%")
print(f"Recall: {round(metrics.recall_score(y_test,y_pred_KNN)*100,2)}%")
print(f"Accuracy: {round(metrics.accuracy_score(y_test,y_pred_KNN)*100,2)}%")
print(f"F1-score: {round(metrics.f1_score(y_test,y_pred_KNN)*100,2)}%")
print(metrics.classification_report(y_test, y_pred_KNN))
# Use predict_proba to get the probability results of KNN Model
from sklearn.metrics import roc_curve
# Score = probability of the positive (churn) class.
y_pred_knn = best_KNN_model.predict_proba(X_test)[:, 1]
fpr_knn, tpr_knn, thresh = roc_curve(y_test, y_pred_knn)
# AUC score
metrics.auc(fpr_knn,tpr_knn)
# ROC Curve
# NOTE(review): plt.figure(1) is immediately superseded by the figsize
# figure on the next line and appears to be redundant.
plt.figure(1)
plt.figure(figsize=(8,5))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_knn, tpr_knn, label='AUC = %0.2f' % metrics.auc(fpr_knn,tpr_knn))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - KNN Model')
plt.legend(loc='best')
plt.show()
Interpretation: The model performance isn’t greatly improved with these best parameters. The F1-score of this model, 0.48, is the lowest among all the models. One possible reason is that we have a mixture of categorical and continuous variables, which affects the model’s ability to reflect the distances among data points.
from sklearn.ensemble import GradientBoostingClassifier as gbm

# Baseline: default cross-validated accuracy of a 100-tree gradient boosting model.
num_trees = 100
gbm_model = gbm(n_estimators=num_trees, random_state=16)
results = cross_val_score(gbm_model, X_train, y_train)
accuracy_pct = round(results.mean() * 100, 2)
print(f"Accuracy for GBM: {accuracy_pct}%")
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid for the gradient boosting machine.
# NOTE(review): 'deviance' was renamed 'log_loss' in scikit-learn 1.1 and the
# old spelling was removed in 1.3 -- confirm against the installed version.
param_grid = {'learning_rate': np.arange(0.02, 0.1, 0.02),
'n_estimators': range(60, 160, 50),
'max_depth': range(2, 10),
'loss':['deviance','exponential'],
'subsample':[1,0.5]}
# Exhaustive search using GridSearchCV's default folds and scorer.
gbm_clf = GridSearchCV(gbm(), param_grid)
gbm_clf.fit(X_train, y_train)
# helper function for printing out grid search results
def print_grid_search_metrics(gs):
print ("Best score: " + str(gs.best_score_))
print ("Best parameters set")
best_parameters = gs.best_params_
for param_name in sorted(best_parameters.keys()):
print(param_name,':',str(best_parameters[param_name]))
# print best parameters
print_grid_search_metrics(gbm_clf)
# best gradient boosting machine
best_GBM_model = gbm_clf.best_estimator_
# Single-step pipeline around the tuned GBM.
gbm_pipe = make_pipeline(best_GBM_model)
gbm_pipe.fit(X_train, y_train)
pipe_pred = gbm_pipe.predict(X_test)
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=True, random_state=16)
# FIX: run the 10-fold CV once and take mean/std from the same score array
# (the original called cross_val_score twice, doubling the cost).
cv_scores = cross_val_score(gbm_pipe, X_train, y_train, cv=cv)
print('cv_acc_mean',round(cv_scores.mean(),4))
print('cv_acc_std',round(cv_scores.std(),4))
# Confusion matrix and headline metrics for the tuned GBM on the test set.
y_pred_gbm = best_GBM_model.predict(X_test)
mat = confusion_matrix(y_test, y_pred_gbm)
ax = plt.subplot()
# mat is transposed: heatmap columns = true labels, rows = predicted labels.
sns.heatmap(mat.T, square=True, annot=True, annot_kws={"size": 13},cbar=False,fmt="d",
cmap= 'Blues', linewidths=.2)
# Widen y-limits so edge rows are not clipped -- presumably a matplotlib
# heatmap workaround; TODO confirm it is still needed.
ax.set_ylim([0,2])
plt.title('GBM Confusion Matrix',y=-0.1, fontsize=13)
plt.xlabel('True label',y=-0.1, fontsize=13)
plt.ylabel('Predicted label', fontsize=13)
fig = plt.gcf()
fig.set_size_inches(8, 9, forward=True)
print(f"Precision: {round(metrics.precision_score(y_test,y_pred_gbm)*100,2)}%")
print(f"Recall: {round(metrics.recall_score(y_test,y_pred_gbm)*100,2)}%")
print(f"Accuracy: {round(metrics.accuracy_score(y_test,y_pred_gbm)*100,2)}%")
print(f"F1-score: {round(metrics.f1_score(y_test,y_pred_gbm)*100,2)}%")
print(metrics.classification_report(y_test,y_pred_gbm))
# Use predict_proba to get the probability of the positive class from the GBM
# (the original comment said "KNN Model" -- it is the GBM here).
# NOTE(review): this rebinds y_pred_gbm from class labels (used above) to
# probabilities; consider a distinct name if the labels are needed again.
y_pred_gbm = best_GBM_model.predict_proba(X_test)[:, 1]
fpr_gbm, tpr_gbm, thresh = roc_curve(y_test, y_pred_gbm)
# AUC score
print("Area Under Curve (AUC) = ",metrics.auc(fpr_gbm,tpr_gbm))
# ROC Curve
# NOTE(review): plt.figure(1) appears redundant (superseded by the next line).
plt.figure(1)
plt.figure(figsize=(8,5))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_gbm, tpr_gbm, label='AUC = %0.2f' % metrics.auc(fpr_gbm,tpr_gbm))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - GBM Model')
plt.legend(loc='best')
plt.show()
# Top-10 GBM feature importances as a horizontal bar chart.
importances_gbm = best_GBM_model.feature_importances_
feat_importances_gbm = pd.Series(best_GBM_model.feature_importances_, index=X.columns)
plt.figure(figsize=(8,5))
feat_importances_gbm.nlargest(10).plot(kind='barh')
Interpretation: The three most important features obtained from the GBM are “Contract month to month”, “tenure”, and “Internet Service Fiber Optic”. Compared to the feature importance obtained from the Random Forest model, “tenure” is still among the top 3. The other two features are more important in the GBM than in RF. For users who have month to month contracts, they have better mobility than those who have a fixed term contract with telecom companies. For this group of users, they don’t have to worry about the consequences of breaking a contract when they make decisions to leave companies. For users who have internet service fiber optic, they are also more likely to churn, which indicates that customers’ satisfaction to internet services is critical in retaining customers.
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with default settings.
LR = LogisticRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
base_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
print(f"Accuracy of base Logistic Regression: {base_acc}%")
# Search regularization strength C (log-spaced over ten decades) and penalty.
# FIX: LogisticRegression's default lbfgs solver only supports the l2 penalty,
# so the l1 half of this grid would error out; liblinear supports both l1 and
# l2, making every grid point valid.
parameters = {
    'C': np.geomspace(1e-5, 1e5, num=20),
    'penalty': ["l1", "l2"],
    'solver': ['liblinear']
}
Grid_LR = GridSearchCV(LR, parameters, cv=10, scoring='accuracy')
Grid_LR.fit(X_train, y_train)
# helper function for printing out grid search results
def print_grid_search_metrics(gs):
    """Print the best CV score of a fitted GridSearchCV and its parameters."""
    print("Best score: " + str(gs.best_score_))
    print("Best parameters set")
    best_parameters = gs.best_params_
    for param_name in sorted(best_parameters.keys()):
        print(param_name, ':', str(best_parameters[param_name]))
print_grid_search_metrics(Grid_LR)
best_LR_model = Grid_LR.best_estimator_
# Single-step pipeline around the tuned logistic regression.
LR_pipe = make_pipeline(best_LR_model)
LR_pipe.fit(X_train, y_train)
pipe_pred = LR_pipe.predict(X_test)
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=True, random_state=16)
# FIX: run the 10-fold CV once and take mean/std from the same score array
# (the original called cross_val_score twice, doubling the cost).
cv_scores = cross_val_score(LR_pipe, X_train, y_train, cv=cv)
print('cv_acc_mean',round(cv_scores.mean(),4))
print('cv_acc_std',round(cv_scores.std(),4))
# Confusion matrix and headline metrics for the tuned LR on the test set.
y_pred_LR = best_LR_model.predict(X_test)
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_test, y_pred_LR)
ax = plt.subplot()
# mat is transposed: heatmap columns = true labels, rows = predicted labels.
sns.heatmap(mat.T, square=True, annot=True, annot_kws={"size": 13},cbar=False,fmt="d",
cmap= 'Blues', linewidths=.2)
# Widen y-limits so edge rows are not clipped -- presumably a matplotlib
# heatmap workaround; TODO confirm it is still needed.
ax.set_ylim([0,2])
plt.title('LR Confusion Matrix',y=-0.1, fontsize=13)
plt.xlabel('True label',y=-0.1, fontsize=13)
plt.ylabel('Predicted label', fontsize=13)
fig = plt.gcf()
fig.set_size_inches(8, 9, forward=True)
print(f"Precision: {round(metrics.precision_score(y_test,y_pred_LR)*100,2)}%")
print(f"Recall: {round(metrics.recall_score(y_test,y_pred_LR)*100,2)}%")
print(f"Accuracy: {round(metrics.accuracy_score(y_test,y_pred_LR)*100,2)}%")
print(f"F1-score: {round(metrics.f1_score(y_test,y_pred_LR)*100,2)}%")
print(metrics.classification_report(y_test, y_pred_LR))
# LR ROC: score each test row with the positive-class probability.
# NOTE(review): this rebinds y_pred_LR from class labels to probabilities.
y_pred_LR = best_LR_model.predict_proba(X_test)[:, 1]
fpr_LR, tpr_LR, thresh = roc_curve(y_test, y_pred_LR)
roc_auc = metrics.auc(fpr_LR, tpr_LR)
print("Area Under Curve (AUC) = ",roc_auc)
# NOTE(review): plt.figure(1) appears redundant (superseded by the next line).
plt.figure(1)
plt.figure(figsize=(8,5))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_LR, tpr_LR,label='AUC = %0.2f' % metrics.auc(fpr_LR, tpr_LR))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - Logistic Regression Model')
plt.legend(loc='best')
plt.show()
# Coefficients of the fitted LR as a tidy table, sorted descending, with a
# column of absolute magnitudes for ranking by effect size.
feat_importances_lr = pd.DataFrame(best_LR_model.coef_[0], index=X_train.columns,
columns=['coefficient']).sort_values('coefficient', ascending=False)
feat_importances_lr = feat_importances_lr.reset_index()
feat_importances_lr = feat_importances_lr.rename(columns={"index": "Feature"})
feat_importances_lr['abs'] = feat_importances_lr['coefficient'].abs()
feat_importances_lr
# Horizontal bar charts of LR coefficients.
# FIX: with barh the features sit on the y-axis and coefficient values on the
# x-axis, so the original's axis labels were swapped; the 'Abosulte' typo in
# the last title is also corrected, and dead commented-out xticks calls removed.
plt.figure(figsize=(20, 8))
plt.subplot(1, 2, 1)
plt.barh(data=feat_importances_lr.sort_values(ascending=False, by='coefficient')[:10], width='coefficient',y='Feature')
plt.ylabel('feature',fontsize=12)
plt.xlabel('coefficient',fontsize=12)
plt.title('Coefficient for Logistic Regression (Top 10)',fontsize=13)
plt.tight_layout()
plt.subplot(1, 2, 2)
plt.barh(data=feat_importances_lr.sort_values(ascending=True, by='coefficient')[:10], width='coefficient',y='Feature')
plt.ylabel('feature',fontsize=12)
plt.xlabel('coefficient',fontsize=12)
plt.title('Coefficient for Logistic Regression (Bottom 10)',fontsize=13)
plt.tight_layout()
# Ranking by absolute coefficient magnitude (effect size regardless of sign).
plt.figure(figsize=(8,5))
plt.barh(data=feat_importances_lr.sort_values(ascending=False, by='abs')[:10], width='abs',y='Feature')
plt.ylabel('feature',fontsize=12)
plt.xlabel('coefficient(abs)',fontsize=12)
plt.title('Absolute Value of Coefficient for Logistic Regression (Top 10)',fontsize=13)
plt.xticks(rotation='vertical')
Interpretation: The top 10 important features of Logistic Regression model are tenure, TotalCharges, MonthlyCharges, Contract_Two year, Contract_Month-to-month, InternetService_Fiber optic, InternetService_DSL, PaymentMethod_Electronic check, StreamingTV_Yes and OnlineSecurity_No.
from sklearn import svm
from sklearn.svm import SVC
from sklearn import model_selection

# Linear support-vector classifier, tuned over C with an l2 penalty.
LS = svm.LinearSVC()
LS
# 25 random 70/30 train/validation splits drive the search.
bs = model_selection.ShuffleSplit(n_splits=25, test_size=0.3, random_state=0)
param_grid = dict(C=[0.25, 0.5, 0.75, 1], penalty=['l2'])
Grid_LS = GridSearchCV(LS, param_grid, cv=bs)
Grid_LS.fit(X_train, y_train)
# helper function for printing out grid search results
def print_grid_search_metrics(gs):
    """Print the best CV score of a fitted GridSearchCV and its parameters."""
    print ("Best score: " + str(gs.best_score_))
    print ("Best parameters set")
    best_parameters = gs.best_params_
    for param_name in sorted(best_parameters.keys()):
        print(param_name,':',str(best_parameters[param_name]))
print_grid_search_metrics(Grid_LS)
best_LS_model = Grid_LS.best_estimator_
from sklearn.pipeline import make_pipeline
# Single-step pipeline around the tuned linear SVM.
LS_pipe = make_pipeline(best_LS_model)
LS_pipe.fit(X_train, y_train)
pipe_pred = LS_pipe.predict(X_test)
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=True, random_state=16)
# FIX: run the 10-fold CV once and take mean/std from the same score array
# (the original called cross_val_score twice, doubling the cost).
cv_scores = cross_val_score(LS_pipe, X_train, y_train, cv=cv)
print('cv_acc_mean',round(cv_scores.mean(),4))
print('cv_acc_std',round(cv_scores.std(),4))
# Confusion matrix and headline metrics for the tuned linear SVM on the test set.
y_pred_LS = best_LS_model.predict(X_test)
mat = confusion_matrix(y_test, y_pred_LS)
ax= plt.subplot()
# mat is transposed: heatmap columns = true labels, rows = predicted labels.
sns.heatmap(mat.T, square=True, annot=True, annot_kws={"size": 13},cbar=False,fmt="d",
cmap= 'Blues', linewidths=.2)
# Widen y-limits so edge rows are not clipped -- presumably a matplotlib
# heatmap workaround; TODO confirm it is still needed.
ax.set_ylim([0,2])
plt.title('Linear SVM Confusion Matrix',y=-0.1, fontsize=13)
plt.xlabel('True label',y=-0.1, fontsize=13)
plt.ylabel('Predicted label', fontsize=13)
fig = plt.gcf()
fig.set_size_inches(8, 9, forward=True)
print(f"Accuracy: {round(metrics.accuracy_score(y_test, y_pred_LS)*100, 2)}%")
print(f"Precision: {round(metrics.precision_score(y_test, y_pred_LS)*100, 2)}%")
print(f"Recall: {round(metrics.recall_score(y_test, y_pred_LS)*100,2)}%")
print(f"F1-Score: {round(metrics.f1_score(y_test, y_pred_LS)*100, 2)}%")
print(metrics.classification_report(y_test, y_pred_LS))
# ROC for the linear SVM.
# FIX: the original scored with the private sklearn API _predict_proba_lr;
# the public decision_function gives raw margin scores, and since that
# probability mapping is monotone in the margin, roc_curve produces the
# identical ROC curve and AUC.  The stray plt.figure(1) (an unused extra
# figure) was also dropped.
y_pred_LS = best_LS_model.decision_function(X_test)
fpr_LS, tpr_LS, thresh = roc_curve(y_test, y_pred_LS)
roc_auc = metrics.auc(fpr_LS, tpr_LS)
print("Area Under Curve (AUC) = ",roc_auc)
plt.figure(figsize=(8,5))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_LS, tpr_LS,label='AUC = %0.2f' % metrics.auc(fpr_LS,tpr_LS))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - Linear SVM Model')
plt.legend(loc='best')
plt.show()
Interpretation: Using GridSearchCV( ), we got the best hyperparameter set with C of 0.5 and penalty of l2. From the classification report results we observed that the SVM, the Gradient Boosting Machine, and the Random Forest algorithm have similar performance. Their precisions are quite close, at around 0.8.
# Hand-built summary table of all five tuned models (plotly).
# NOTE(review): every cell below is transcribed manually from earlier runs;
# re-running the notebook will NOT refresh these numbers automatically.
from plotly import graph_objects as go
fig = go.Figure(layout = {"title": "Model Performance Master Table"},
data=[go.Table(header=dict(values=["Algorithm","Key hyperparamters","Precision","Recall","Accuracy","F1-Score","ROC AUC Score"]),
cells=dict(values=[["Logistic Regression","GBM","Linear SVM","Random Forest","KNN"],
[
"C:6.158, penalty:l2",
"learning_rate:0.06, loss:exponential, max_depth:2, n_estimators:110, subsample:0.5",
"C:0.5, penalty:l2",
"bootstrap:True, max_feature:auto, min_sample_leaf:2, min_sample_split:5, n_estimators:25",
"metric:manhattan, n_neighbors:6, weights:uniform"
],
["65.55%","66.20%","66.08%","65.70%","58.37%"], #Precision
["55.21%","53.52%","52.68%","51.27%","40.28%"], #Recall
["81.38%","81.38%","81.24%","80.95%","77.68%"], #Accuracy
["59.94%","59.40%","58.40%","55.87%","47.67%"], #F1-score
["84.08%","84.28%","83.55%","82.84%","78.31%"], #ROC_AUC_Score
]))
])
fig.update_layout(width=1000,height=550)
fig.show()
Using F1-score as the key evaluation metric, the ranking of our machine learning models is: Logistic Regression (59.94%), Gradient Boosting Machine (59.40%), Linear SVM (58.40%), Random Forest (55.87%), and K-Nearest Neighbors (47.67%). Logistic regression has several advantages when applied to real business problems: it is easy to implement, easy to interpret, and efficient to train. Running logistic regression is not as time-consuming as running other models such as the support vector machine and the gradient boosting machine.